{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from getpass import getpass\n",
    "from random import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests_html\n",
    "import selenium\n",
    "from lxml.html import fromstring\n",
    "from requests_html import HTMLSession\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-d389e8bbeddb>:14: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts)\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Configure Chrome before the browser session is started.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" startup error\n",
    "opts.add_argument('window-size=1920x3000')  # requested browser window size\n",
    "opts.add_argument('--disable-gpu')  # recommended by Google's docs to sidestep a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the WeChat Official Accounts Platform start page in the controlled browser.\n",
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the login-type selector link; the following cells then fill in the account and\n",
    "# password inputs (NOTE(review): presumably the page starts in another login mode - confirm).\n",
    "# Tip from the original author: while developing, inspect element.get_attribute('innerHTML')\n",
    "# before calling click() to confirm that the XPath matched the intended element.\n",
    "element = driver.find_element(By.XPATH, '//a[@class=\"login__type__container__select-type\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the login credentials from the environment, falling back to an interactive prompt,\n",
    "# so that no account name or password is stored in the notebook.\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"WeChat MP account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"WeChat MP password: \"),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 填写账号信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name into the login form's \"account\" input.\n",
    "element = driver.find_element(By.XPATH, '//input[@name=\"account\"]')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the password into the login form's \"password\" input.\n",
    "element = driver.find_element(By.XPATH, '//input[@name=\"password\"]')\n",
    "element.clear()\n",
    "element.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit the login form by clicking the login button.\n",
    "element = driver.find_element(By.XPATH, '//a[@class=\"btn_login\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 点击图文素材"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click \"new image-and-text material\" (新建图文素材).\n",
    "# NOTE(review): the absolute XPath depends on the exact page layout and will break if it changes.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[3]/div/div/div[2]/div[2]/div[3]/div[2]/div/div[1]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 检查窗口信息后，切换窗口"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-F37EE0721127B90883C55824C312468C',\n",
       " 'CDwindow-07F56E9EB2ED1737B2A1BDFED1C3E90E']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the handles of all windows currently open in this browser session.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating a new article opened a separate window, so switch to it (the last handle).\n",
    "driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 点击超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the hyperlink entry in the editor.\n",
    "# NOTE(review): the absolute XPath depends on the exact page layout and will break if it changes.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[1]/div/div/div/div[2]/div[2]/div/ul[2]/li[1]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click \"choose another official account\" (选择其他公众号).\n",
    "# NOTE(review): the absolute XPath depends on the exact page layout and will break if it changes.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div/button')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 选择所需公众号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the name of the wanted official account into the account search box.\n",
    "# The input is located once and reused for both clear() and send_keys().\n",
    "search_box = driver.find_element(By.XPATH, '//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]')\n",
    "search_box.clear()\n",
    "search_box.send_keys('回形针PaperClip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# Click the magnifier (search) button; its innerHTML is printed first as a sanity check.\n",
    "element = driver.find_element(By.XPATH, '//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/SlOqFKqEO4ELATX8URJMElMzLgicN6ulppYY2pSNER8ObJUUL4H2erBBp39bgc8VafsVUX1I6bPEqJDtC965s9w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">回形针PaperClip</strong> <i class=\"inner_link_account_wechat\">微信号：papercliptv</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/xq7gvyZBtmofrXeTQVtnPJXTzI315CbgqA9kuxGR3iaUfhRLGxNQVCIm98WAkXK0LqYl05kYkLEeT6ibp6JjomGQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">回形针PaperClip视频</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/sz_mmbiz_png/8piaGVjBatGpSQO4L7qABzFOicaV7coWwVdoo5xJgo9PC7zOJaEuiap8QyfFQiaHz2D7XPZ2chdhibgiasn0IRl6tZbA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">回形针PaperClip播放</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/kxpiabSRGvslWddMNvwgLn36Co46DlxbDNfr0lGAWyeMO7xXXZhD5z6ibL43m8mkPDoE04eTeKElMkzlBSeeiawIA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">考研回形针</strong> <i class=\"inner_link_account_wechat\">微信号：kyhuixing</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/wOHDTHe1XEJRe0o6jk0IwRLmsr0INcQp8baqAB9ghLSrQ4u9WUIVaRV4exDT2ia2lf5qfnTiaYibn9ia10cGlYH6yQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">历史回形针</strong> <i class=\"inner_link_account_wechat\">微信号：qiantan2016</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "# Capture the HTML of the account search-result list and keep it for parsing.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the captured search-result HTML into an lxml element tree.\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every <li class=\"inner_link_account_item\"> in the parsed result list is one account.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# Collect one record per account: nickname text, WeChat-id text and avatar image URL.\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "# Build the table once from the list of records.\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>回形针PaperClip</td>\n",
       "      <td>微信号：papercliptv</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/SlOqFKqEO4ELATX...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>回形针PaperClip视频</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/xq7gvyZBtmofrXe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>回形针PaperClip播放</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/sz_mmbiz_png/8piaGVjBatGp...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>考研回形针</td>\n",
       "      <td>微信号：kyhuixing</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/kxpiabSRGvslWdd...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>历史回形针</td>\n",
       "      <td>微信号：qiantan2016</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/wOHDTHe1XEJRe0o...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         nickname           wechat  \\\n",
       "0    回形针PaperClip  微信号：papercliptv   \n",
       "1  回形针PaperClip视频          微信号：未设置   \n",
       "2  回形针PaperClip播放          微信号：未设置   \n",
       "3           考研回形针    微信号：kyhuixing   \n",
       "4           历史回形针  微信号：qiantan2016   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/SlOqFKqEO4ELATX...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/xq7gvyZBtmofrXe...  \n",
       "2  http://mmbiz.qpic.cn/sz_mmbiz_png/8piaGVjBatGp...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/kxpiabSRGvslWdd...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/wOHDTHe1XEJRe0o...  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the parsed accounts.\n",
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/SlOqFKqEO4ELATX8URJMElMzLgicN6ulppYY2pSNER8ObJUUL4H2erBBp39bgc8VafsVUX1I6bPEqJDtC965s9w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">回形针PaperClip</strong> <i class=\"inner_link_account_wechat\">微信号：papercliptv</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Select the first account in the result list; its innerHTML is printed first as a sanity check.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Page-jump test. The code is wrapped in a string literal, so it is not executed;\n",
    "# the cell only echoes the string.\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 63]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Upper bound for page jumps: read the numeric labels shown by the pagination widget.\n",
    "# NOTE(review): presumably the first label is the current page and the last the page count - confirm.\n",
    "l_e = driver.find_elements(By.XPATH, '//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int = [int(x.text) for x in l_e]\n",
    "print(l_e_int)\n",
    "print(l_e_int[0] == l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]\n"
     ]
    }
   ],
   "source": [
    "# Page numbers to visit: 1 up to and including the last number shown by the pager.\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 循环遍历"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Global variables.\n",
    "# html_raw is filled by process_pages (page number -> innerHTML of the article list).\n",
    "# process_pages assigns its own local main_content/element, so the two names below\n",
    "# are not updated by it.\n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages):\n",
    "    \"\"\"Visit each results page and store its article-list HTML in ``html_raw``.\n",
    "\n",
    "    For every page number in ``pages`` the number is typed into the pagination\n",
    "    jump box, the jump link is clicked and, after a random pause, the innerHTML\n",
    "    of the ``inner_link_article_list`` container is saved as ``html_raw[p]``.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of page numbers to visit.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are written into the module-level ``html_raw`` dict, and the\n",
    "        module-level ``driver`` is used to drive the browser.\n",
    "\n",
    "    NOTE(review): the pause is a sleep, not a wait on the new content, so a slow\n",
    "    page load presumably stores the previous page's HTML again - check the result\n",
    "    for duplicated snippets.\n",
    "    \"\"\"\n",
    "    jump_form = '//span[@class=\"weui-desktop-pagination__form\"]'\n",
    "    for p in pages:\n",
    "        print(p, end='\\t')\n",
    "\n",
    "        # The controls are looked up again on every iteration, after the previous jump.\n",
    "        跳转_input = driver.find_element(By.XPATH, jump_form + '/input')\n",
    "        跳转_a = driver.find_element(By.XPATH, jump_form + '/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(str(p))\n",
    "        跳转_a.click()\n",
    "\n",
    "        # Random pause of 1 to 11 seconds between page requests.\n",
    "        time.sleep(1 + 10 * random())\n",
    "\n",
    "        element = driver.find_element(By.XPATH, '//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 查看页数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t57\t58\t59\t60\t61\t62\t63\t"
     ]
    }
   ],
   "source": [
    "# Crawl every page. This is slow: each page is followed by a random pause of 1 to 11 seconds.\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>60</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>63 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "..                                                ...\n",
       "59  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "60  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "61  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "62  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "63  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "\n",
       "[63 rows x 1 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per crawled page: the index is the page number and the single column\n",
    "# holds that page's article-list HTML.\n",
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "# Persist the raw HTML twice: in IPython's %store database and as a pickle file.\n",
    "%store html_raw\n",
    "import pickle\n",
    "\n",
    "# The context manager closes the file, so the pickle is fully written to disk.\n",
    "with open(\"html_raw\", 'wb') as filehandler:\n",
    "    pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "61\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>63</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "63  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows whose HTML repeats an earlier row are flagged as duplicates.\n",
    "is_repeat = df.duplicated()\n",
    "\n",
    "# Keep the first occurrence of every snippet and report how many remain.\n",
    "df_out = df.loc[~is_repeat]\n",
    "print(len(df_out))\n",
    "\n",
    "# Display the rows that were dropped.\n",
    "df.loc[is_repeat]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12, 63]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12, 63]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pages to fetch again: first the ones whose HTML duplicated an earlier page ...\n",
    "try_again = df.index[df.duplicated()].tolist()\n",
    "print(try_again)\n",
    "# ... then any page number that is missing from the crawl result altogether.\n",
    "try_again += list(set(pages) - set(df.index.values))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 爬取公众号数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the official account; it fills the \"{公众号}\" placeholder of the output file names.\n",
    "公众号 = \"回形针PaperClip\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the de-duplicated page snippets to a tab-separated file named after the account.\n",
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    \"\"\"Turn one page's article-list HTML into a DataFrame of articles.\n",
    "\n",
    "    Titles, dates and hrefs are extracted from the snippet with XPath, and the\n",
    "    text of every linked article is fetched with ``get_text``.\n",
    "\n",
    "    Args:\n",
    "        _snippet_: HTML string to parse.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns ``title``, ``create_time``, ``link`` and ``text``.\n",
    "    \"\"\"\n",
    "    tree = fromstring(_snippet_)\n",
    "    title_nodes = tree.xpath('//div[@class=\"inner_link_article_title\"]/span[2]')\n",
    "    date_nodes = tree.xpath('//div[@class=\"inner_link_article_date\"]')\n",
    "    hrefs = list(tree.xpath('//a/@href'))\n",
    "    columns = {\n",
    "        \"title\": [node.text for node in title_nodes],\n",
    "        \"create_time\": [node.text for node in date_nodes],\n",
    "        \"link\": hrefs,\n",
    "        # One download per link.\n",
    "        \"text\": [get_text(href) for href in hrefs],\n",
    "    }\n",
    "    return pd.DataFrame(columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text(link, timeout=30):\n",
    "    \"\"\"Download an article page and return the text found inside ``#js_content``.\n",
    "\n",
    "    Args:\n",
    "        link: URL of the article.\n",
    "        timeout: seconds to wait for the server before the request fails\n",
    "            (default 30), so one stalled request cannot hang the crawl forever.\n",
    "\n",
    "    Returns:\n",
    "        All ``<span>`` text nodes under ``#js_content`` joined together, followed\n",
    "        by all ``<p>`` text nodes joined together (the two groups are not\n",
    "        interleaved in document order).\n",
    "    \"\"\"\n",
    "    text_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    text_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    # The context manager closes the session, so its connections are released\n",
    "    # instead of piling up over many calls.\n",
    "    with HTMLSession() as session:\n",
    "        r = session.get(url=link, timeout=timeout)\n",
    "        text_1 = ''.join(r.html.xpath(text_xpath_1))\n",
    "        text_2 = ''.join(r.html.xpath(text_xpath_2))\n",
    "    return text_1 + text_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7,9,10,7,9,8,6,7,8,8,7,9,6,8,10,8,11,7,8,9,8,8,7,8,8,7,7,10,8,10,8,8,7,10,8,10,10,8,10,8,9,9,9,8,8,10,11,9,7,5,6,8,5,5,5,5,5,7,6,5,5,5,5,"
     ]
    }
   ],
   "source": [
    "# Parse every crawled page into a DataFrame of articles; parsing also downloads each article's text.\n",
    "# NOTE(review): this reads `df` for every page in `pages`, not the de-duplicated `df_out`, so pages\n",
    "# flagged as duplicates are parsed again - confirm that this is intended.\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>你喝的牛奶从哪来丨回形针</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>\\n展开\\n\\n\\n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Vol.176 你喝的牛奶从哪来丨视频文字稿</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>牛奶、羊奶、牦牛奶、骆驼奶甚至还有驴奶，这么多奶，为什么偏偏牛奶能够成为国民奶？对比一下营养...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>懒人在音乐、断食、电击中书写 blog丨有点东西</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>「好看」：配色和排版非常舒服。能量代谢局部图（左），发光生物图鉴（右）「科学」：为了展示病毒...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>机器学习的最底层，或许是人类丨回形针</td>\n",
       "      <td>2021-05-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在今天的这篇推送里，我们希望和你简单聊聊人工智能背后的「隐形」人工 —— 在了解机器识别技术...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>快出门，今晚有超级月全食丨回形针</td>\n",
       "      <td>2021-05-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>2021 年 5 月 26 日，是一个值得你提前下班放学，对着天空端好望远镜和相机的日子。从...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>482</th>\n",
       "      <td>如何拯救秃顶 | 回形针</td>\n",
       "      <td>2018-05-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>坦克驾驶指南 | 回形针</td>\n",
       "      <td>2018-04-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>混音师如何拯救歌手 | 回形针</td>\n",
       "      <td>2018-04-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>跑车凭什么这么贵 | 回形针</td>\n",
       "      <td>2018-04-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>486</th>\n",
       "      <td>我们正在寻找动效设计师 | 回形针</td>\n",
       "      <td>2018-04-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>487 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        title create_time  \\\n",
       "0                你喝的牛奶从哪来丨回形针  2021-05-28   \n",
       "1      Vol.176 你喝的牛奶从哪来丨视频文字稿  2021-05-28   \n",
       "2    懒人在音乐、断食、电击中书写 blog丨有点东西  2021-05-29   \n",
       "3          机器学习的最底层，或许是人类丨回形针  2021-05-27   \n",
       "4            快出门，今晚有超级月全食丨回形针  2021-05-26   \n",
       "..                        ...         ...   \n",
       "482              如何拯救秃顶 | 回形针  2018-05-09   \n",
       "483              坦克驾驶指南 | 回形针  2018-04-27   \n",
       "484           混音师如何拯救歌手 | 回形针  2018-04-11   \n",
       "485            跑车凭什么这么贵 | 回形针  2018-04-04   \n",
       "486         我们正在寻找动效设计师 | 回形针  2018-04-03   \n",
       "\n",
       "                                                  link  \\\n",
       "0    http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "1    http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "2    http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "3    http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "4    http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "..                                                 ...   \n",
       "482  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "483  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "484  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "485  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "486  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "\n",
       "                                                  text  \n",
       "0                                           \\n展开\\n\\n\\n  \n",
       "1    牛奶、羊奶、牦牛奶、骆驼奶甚至还有驴奶，这么多奶，为什么偏偏牛奶能够成为国民奶？对比一下营养...  \n",
       "2    「好看」：配色和排版非常舒服。能量代谢局部图（左），发光生物图鉴（右）「科学」：为了展示病毒...  \n",
       "3    在今天的这篇推送里，我们希望和你简单聊聊人工智能背后的「隐形」人工 —— 在了解机器识别技术...  \n",
       "4    2021 年 5 月 26 日，是一个值得你提前下班放学，对着天空端好望远镜和相机的日子。从...  \n",
       "..                                                 ...  \n",
       "482  在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...  \n",
       "483  哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...  \n",
       "484  在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...  \n",
       "485  想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...  \n",
       "486  这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...  \n",
       "\n",
       "[487 rows x 4 columns]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>477</th>\n",
       "      <td>如何拯救秃顶 | 回形针</td>\n",
       "      <td>2018-05-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>478</th>\n",
       "      <td>坦克驾驶指南 | 回形针</td>\n",
       "      <td>2018-04-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>479</th>\n",
       "      <td>混音师如何拯救歌手 | 回形针</td>\n",
       "      <td>2018-04-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>480</th>\n",
       "      <td>跑车凭什么这么贵 | 回形针</td>\n",
       "      <td>2018-04-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>481</th>\n",
       "      <td>我们正在寻找动效设计师 | 回形针</td>\n",
       "      <td>2018-04-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>482</th>\n",
       "      <td>如何拯救秃顶 | 回形针</td>\n",
       "      <td>2018-05-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>坦克驾驶指南 | 回形针</td>\n",
       "      <td>2018-04-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>混音师如何拯救歌手 | 回形针</td>\n",
       "      <td>2018-04-11</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>跑车凭什么这么贵 | 回形针</td>\n",
       "      <td>2018-04-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>486</th>\n",
       "      <td>我们正在寻找动效设计师 | 回形针</td>\n",
       "      <td>2018-04-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...</td>\n",
       "      <td>这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 title create_time  \\\n",
       "477       如何拯救秃顶 | 回形针  2018-05-09   \n",
       "478       坦克驾驶指南 | 回形针  2018-04-27   \n",
       "479    混音师如何拯救歌手 | 回形针  2018-04-11   \n",
       "480     跑车凭什么这么贵 | 回形针  2018-04-04   \n",
       "481  我们正在寻找动效设计师 | 回形针  2018-04-03   \n",
       "482       如何拯救秃顶 | 回形针  2018-05-09   \n",
       "483       坦克驾驶指南 | 回形针  2018-04-27   \n",
       "484    混音师如何拯救歌手 | 回形针  2018-04-11   \n",
       "485     跑车凭什么这么贵 | 回形针  2018-04-04   \n",
       "486  我们正在寻找动效设计师 | 回形针  2018-04-03   \n",
       "\n",
       "                                                  link  \\\n",
       "477  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "478  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "479  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "480  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "481  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "482  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "483  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "484  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "485  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "486  http://mp.weixin.qq.com/s?__biz=MzA3NDM1MjUwNg...   \n",
       "\n",
       "                                                  text  \n",
       "477  在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...  \n",
       "478  哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...  \n",
       "479  在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...  \n",
       "480  想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...  \n",
       "481  这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...  \n",
       "482  在高水平的雄性激素影响下，几乎所有男性都会有不同程度的脱发。早在 2011 年，就有数据显示...  \n",
       "483  哪个男孩不想开坦克呢？在柏林郊区 Beerfelde，160 欧可以让你开着 T55 坦克在...  \n",
       "484  在中文互联网，「百万年薪调音师」是一个流传多年的传说。但完成视频里这些神奇操作的并不是调音师...  \n",
       "485  想买一辆顶级跑车，一千万是起步价。而 2017 年全球最畅销的中国品牌车五菱宏光，顶配也不过...  \n",
       "486  这里是回形针，你的当代生活说明书。如果你喜欢我们的内容，也有兴趣和我们一起工作，那么不妨看看...  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#选取公众号最早更新的十篇文章\n",
    "df_url_out.tail(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 导出文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export df_url_out to '<公众号>文章.xlsx' through openpyxl; the 公众号 value is also used as the sheet name.\n",
    "output_path = '{公众号}文章.xlsx'.format(公众号=公众号)\n",
    "with pd.ExcelWriter(output_path, mode='w', engine=\"openpyxl\") as excel_writer:\n",
    "    df_url_out.to_excel(excel_writer, sheet_name=公众号)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
